import re
import requests
import json
from bs4 import BeautifulSoup
# Default HTTP headers; a browser-like user-agent, presumably to avoid naive
# bot blocking. NOTE(review): this module-level dict is never used — download()
# builds its own identical local copy; consider removing one of the two.
header = {'user-agent':'chrome'}

def number(a):
    """Sort key: the first run of digits found in a tag's text, as an int.

    Parameters:
        a: a BeautifulSoup tag (anything exposing a ``.string`` attribute).

    Returns:
        The first integer embedded in ``a.string``, or 0 when the tag has
        no text or no digits (the original crashed in those cases).
    """
    m = re.search(r'\d+', a.string or '')
    return int(m.group()) if m else 0

# Parse the locally saved table-of-contents page, sort the article links by
# the chapter number in their text, and build the full list of article URLs.
with open('cnhistory.html', 'r', encoding='utf-8') as mulu:
    # Explicit parser keeps behavior identical across installs (bs4 otherwise
    # silently picks whichever parser happens to be available).
    soup = BeautifulSoup(mulu, 'html.parser')
items = soup.find_all('a', class_='title')
items.sort(key=number)
urls = ['https://www.jianshu.com' + a['href'] for a in items]

# Base name shared by every output file written under the cnhistory/ directory.
filename = 'cnhistory'
def download(url, index):
    """Fetch one article page, archive the raw HTML, and extract its text.

    Parameters:
        url: full article URL to fetch.
        index: 1-based sequence number used in the archived file's name.

    Returns:
        dict with 'name' (article title, all whitespace removed) and
        'chapter' (list of paragraph strings, all whitespace removed).
    """
    header = {'user-agent': 'chrome'}
    req = requests.get(url, headers=header)
    # Keep a raw copy of the page so it can be reprocessed offline.
    # (with-block guarantees the handle is closed even on write errors.)
    with open('cnhistory/{}{}.html'.format(filename, index), 'w', encoding='utf-8') as f:
        f.write(req.text)
    soup = BeautifulSoup(req.text, 'html.parser')
    article = soup.find('article')
    paragraphs = article.find_all('p')
    # First <p> holds the title; a child tag's .string can be None (e.g. a
    # <br> or nested markup), which would crash '+=' — substitute ''.
    name = ''.join(child.string or '' for child in paragraphs[0].children)
    rtn = {'name': re.sub(r'\s', '', name)}
    chapter = []
    for p in paragraphs[1:]:
        # Skip paragraphs with no direct string content (nested markup).
        if not p.string:
            continue
        chapter.append(re.sub(r'\s', '', p.string))
    rtn['chapter'] = chapter
    return rtn
	
def main(urls):
    """Download every article in *urls* and save each as a JSON text file.

    Parameters:
        urls: iterable of article URLs, already in chapter order; the 1-based
            position of each URL becomes the output file's index.
    """
    for i, url in enumerate(urls, start=1):
        content = download(url, i)
        print(content['name'])
        # ensure_ascii=False writes Chinese text as-is instead of \uXXXX
        # escapes; json.loads reads both forms identically.
        with open('cnhistory/{}{}.txt'.format(filename, i), 'w', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False))
		
# Guard the entry point so importing this module doesn't re-run the scrape.
if __name__ == '__main__':
    main(urls)